View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: HtmlURLImpl.java,v 1.11 2005/08/08 13:23:08 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.extractor;
28  import java.net.URLDecoder;
29  import org.apache.commons.lang.StringUtils;
30  import org.apache.log4j.Logger;
31  import org.smartcrawler.common.SCLogger;
32  
33  
34  /***
35   *
36   *
37   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
38   * @version <tt>$Revision: 1.11 $</tt>
39   */
40  public class HtmlURLImpl implements HtmlURL {
41  
42      private String extractedURL;
43      private String cleanedURL;
44  
45      private static Logger log = SCLogger.getLogger(HtmlURLImpl.class);
46  
47      /***
48       *
49       * @param extractedURL
50       */
51      public HtmlURLImpl(String extractedURL) {
52          this.extractedURL = extractedURL.trim();
53          //this.initialLink = initialLink;
54          this.cleanedURL = clean();
55      }
56  
57      /***
58       *
59       * @return
60       */
61      public boolean isValid() {
62          return !(
63                  cleanedURL.equals("..") ||
64                  cleanedURL.startsWith("#") ||
65                  cleanedURL.toLowerCase().startsWith("javascript:") ||
66                  cleanedURL.toLowerCase().startsWith("mailto:")
67                  );
68      }
69  
70      /***
71       *
72       * @return
73       */
74      public String getCleanedLinkAsString() {
75          return this.cleanedURL;
76      }
77  
78      /***
79       *
80       * @return
81       */
82      protected String clean() {
83          log.debug("clean(): BEGIN");
84  
85          String res = null;
86          if (extractedURL != null) {
87  
88              res = extractedURL.replace("\"", " ").trim();//remove the "
89              res = res.replace("'", " ").trim();//remove the '
90  
91              if (res.toLowerCase().endsWith("/")) {
92                  res = res.substring(0, res.length() - 1);
93              }
94              if (res.toLowerCase().startsWith("./")) {
95                  res = res.substring(2, res.length());
96              }
97  
98              //a & can be encoded as &amp;
99              res = StringUtils.replace(res, "&amp;", "&");
100             try {
101                 res = URLDecoder.decode(res,"UTF-8");
102             } catch(Exception e){}
103         }
104         log.debug("clean(): " + extractedURL + "->" + res);
105         log.debug("clean(): END");
106         return res;
107     }
108 
109     /***
110      *
111      * @return
112      */
113     public int getType() {
114         int type = -1;
115         if (this.cleanedURL.startsWith("/"))
116             type = LINK_ABSOLUTE_URI;
117         else if (this.cleanedURL.toLowerCase().startsWith("http://") || 
118                 this.cleanedURL.toLowerCase().startsWith("https://"))
119             type = LINK_ABSOLUTE_URL;
120         else
121             type = LINK_RELATIVE;
122 
123         return type;
124     }
125 }